library(tidyverse)
library(caret)

Data

The data are divided by quartiles in order to determine which profiles of rankings are slow and which are fast. Taking the number of alternatives into account, the profiles are divided into four equal parts: one quarter is taken as fast and the remaining three quarters are considered slow.

Storing counts in `nn`, as `n` already present in input
ℹ Use `name = "new_name"` to pick a new name.
Using alpha for a discrete variable is not advised.
# For the regression problem
# fitControl <- trainControl(
#   method = "repeatedcv",
#   number = 5,
#   repeats = 2)

fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary)

Predicting execution time for profiles of a fixed size

Joining, by = c("n", "m", "id")
# Fit control para 
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary)

Training 25%-75%

# Da muy malos resultados porque está desbalanceado
# # Para n = 10
# totrain <- data_quartiles %>% 
#   filter(n==10) %>% 
#   mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
#   select(starts_with("mu"), quartile) 
# set.seed(123)
# trainIndex <- createDataPartition(totrain$quartile, p = .8, 
#                                   list = FALSE, 
#                                   times = 1)
# dataTrain <- totrain[ trainIndex,]
# dataTest  <- totrain[-trainIndex,]
# set.seed(123)
# mclas_rf_10 <- train(
#   quartile ~., data = dataTrain, 
#   method = "rf",
#   tuneLength = 3,
#   trControl = fitControl,
#   metric = "AUC"
# )
# mclas_rf_10
library(ROSE)

Para n = 10 con los datos normalizados

Y en test:

pred <- predict(rf_10_rose_norm, dataTest)
postResample(pred = pred, obs = dataTest$quartile)
confusionMatrix(data = pred, reference = dataTest$quartile, mode = "prec_recall")

Y sin normalizar:

fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary,
  sampling = "up")
Error in trainControl(method = "repeatedcv", number = 5, repeats = 2,  : 
  no se pudo encontrar la función "trainControl"
rf_10_rose
Random Forest 

2240 samples
  16 predictor
   2 classes: 'fast', 'slow' 

No pre-processing
Resampling: Cross-Validated (5 fold, repeated 2 times) 
Summary of sample sizes: 1792, 1792, 1792, 1792, 1792, 1792, ... 
Resampling results across tuning parameters:

  mtry  AUC        Precision  Recall     F        
   2    0.7875077  0.7439379  0.7243243  0.7336813
   9    0.7803522  0.7403700  0.7391892  0.7394959
  16    0.7768513  0.7310205  0.7436937  0.7371480

AUC was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
confusionMatrix(data = pred, reference = (dataTest %>% filter(quartile == "fast"))$quartile, mode = "prec_recall")
Confusion Matrix and Statistics

          Reference
Prediction fast slow
      fast  125    0
      slow   15    0
                                          
               Accuracy : 0.8929          
                 95% CI : (0.8294, 0.9388)
    No Information Rate : 1               
    P-Value [Acc > NIR] : 1.0000000       
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : 0.0003006       
                                          
              Precision : 1.0000          
                 Recall : 0.8929          
                     F1 : 0.9434          
             Prevalence : 1.0000          
         Detection Rate : 0.8929          
   Detection Prevalence : 0.8929          
      Balanced Accuracy :     NA          
                                          
       'Positive' Class : fast            
                                          
rose_train <- ROSE(quartile ~ ., data  = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
Error in ROSE(quartile ~ ., data = totrain) : 
  no se pudo encontrar la función "ROSE"

Comparación de las variables más importantes

(vip_mreg_rf_8_norm + vip_mreg_rf_9_norm + vip_mreg_rf_10_norm) |
(vip_mreg_rf_8 + vip_mreg_rf_9 + vip_mreg_rf_10)
totrain <- data_quartiles %>% 
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
  select(starts_with("mu"), quartile) %>%
  filter(n!=10)
totest <- data_quartiles %>% 
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
  select(starts_with("mu"), quartile) %>%
  filter(n==10)

set.seed(123)
rose_train <- ROSE(quartile ~ ., data  = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
print(table(rose_train$quartile))

fast slow 
4146 4254 
set.seed(123)
trainIndex <- createDataPartition(rose_train$quartile, p = .8, 
                                  list = FALSE, 
                                  times = 1)
dataTrain <- rose_train[ trainIndex,]
dataTest  <- rose_train[-trainIndex,]

Searching for the outliers

data_outliers_normalized <- left_join(data_normalized %>% 
                               mutate(id = as.double(as.character(id))), outliers) 
Joining, by = c("n", "m", "id")
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  sampling = "down")

data_outliers_normalized
totrain <- data_outliers_normalized %>% 
  filter(n==8) %>%
  select(starts_with("mu"), outlier)
  
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8, 
                                  list = FALSE, 
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

set.seed(123)
rf_8_outlier <- train(
  outlier ~., data = dataTrain, method = "rf",
  tuneLength = 10,
  trControl = fitControl,
  metric = "ROC"
)

vip_rf_8_outlier <- vip(rf_8_outlier)
pred <- predict(rf_8_outlier, dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics

          Reference
Prediction yes  no
       yes  27 109
       no    9 415
                                          
               Accuracy : 0.7893          
                 95% CI : (0.7531, 0.8224)
    No Information Rate : 0.9357          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.2363          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.75000         
            Specificity : 0.79198         
         Pos Pred Value : 0.19853         
         Neg Pred Value : 0.97877         
             Prevalence : 0.06429         
         Detection Rate : 0.04821         
   Detection Prevalence : 0.24286         
      Balanced Accuracy : 0.77099         
                                          
       'Positive' Class : yes             
                                          
sens_rf_8_outlier <- sensitivity(pred, dataTest$outlier)
pred <- predict(rf_8_outlier, dataTest, type= "prob")
auc_rf_8_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
totrain <- data_outliers_normalized %>% 
  filter(n==9) %>%
  select(starts_with("mu"), outlier)
  
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8, 
                                  list = FALSE, 
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

set.seed(123)
rf_9_outlier <- train(
  outlier ~., data = dataTrain, method = "rf",
  tuneLength = 10,
  trControl = fitControl,
  metric = "ROC"
)

vip_rf_9_outlier <- vip(rf_9_outlier)
pred <- predict(rf_9_outlier, dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics

          Reference
Prediction yes  no
       yes  35  99
       no    9 416
                                          
               Accuracy : 0.8068          
                 95% CI : (0.7716, 0.8387)
    No Information Rate : 0.9213          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.3117          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.79545         
            Specificity : 0.80777         
         Pos Pred Value : 0.26119         
         Neg Pred Value : 0.97882         
             Prevalence : 0.07871         
         Detection Rate : 0.06261         
   Detection Prevalence : 0.23971         
      Balanced Accuracy : 0.80161         
                                          
       'Positive' Class : yes             
                                          
sens_rf_9_outlier <- sensitivity(pred, dataTest$outlier)
pred <- predict(rf_9_outlier, dataTest, type= "prob")
auc_rf_9_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
totrain <- data_outliers_normalized %>% 
  filter(n==10) %>%
  select(starts_with("mu"), outlier)
  
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8, 
                                  list = FALSE, 
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

set.seed(123)
rf_10_outlier <- train(
  outlier ~., data = dataTrain, method = "rf",
  tuneLength = 10,
  trControl = fitControl,
  metric = "ROC"
)

vip_rf_10_outlier <- vip(rf_10_outlier)
pred <- predict(rf_10_outlier, dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics

          Reference
Prediction yes  no
       yes  34 139
       no    7 379
                                          
               Accuracy : 0.7388          
                 95% CI : (0.7003, 0.7748)
    No Information Rate : 0.9267          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.226           
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.82927         
            Specificity : 0.73166         
         Pos Pred Value : 0.19653         
         Neg Pred Value : 0.98187         
             Prevalence : 0.07335         
         Detection Rate : 0.06082         
   Detection Prevalence : 0.30948         
      Balanced Accuracy : 0.78046         
                                          
       'Positive' Class : yes             
                                          
sens_rf_10_outlier <- sensitivity(pred, dataTest$outlier)
pred <- predict(rf_10_outlier, dataTest, type= "prob")
auc_rf_10_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
vip_rf_8_outlier + vip_rf_9_outlier + vip_rf_10_outlier

totrain <- data_outliers_normalized %>% 
  select(starts_with("mu"), outlier)
  
set.seed(123)
trainIndex <- createDataPartition(totrain$outlier, p = .8, 
                                  list = FALSE, 
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

set.seed(123)
rf_all_outlier <- train(
  #outlier ~., data = dataTrain, method = "rf",
  outlier ~., data = dataTrain, method = "rf",
  tuneLength = 8,
  trControl = fitControl,
  metric = "ROC"
)

pred <- predict(rf_all_outlier, dataTest)
postResample(pred = pred, obs = dataTest$outlier)
 Accuracy     Kappa 
0.7641453 0.2541812 
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics

          Reference
Prediction  yes   no
       yes  102  377
       no    19 1181
                                          
               Accuracy : 0.7641          
                 95% CI : (0.7431, 0.7843)
    No Information Rate : 0.9279          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.2542          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.84298         
            Specificity : 0.75802         
         Pos Pred Value : 0.21294         
         Neg Pred Value : 0.98417         
             Prevalence : 0.07207         
         Detection Rate : 0.06075         
   Detection Prevalence : 0.28529         
      Balanced Accuracy : 0.80050         
                                          
       'Positive' Class : yes             
                                          
sens_rf_all_outlier <- sensitivity(pred, dataTest$outlier)

vip(rf_all_outlier)


pred <- predict(rf_all_outlier, dataTest, type= "prob")
auc_rf_all_outlier <- AUC(pred$yes, ifelse(dataTest$outlier == "yes", 1, 0))
vip(rf_all_outlier, 
    horizontal = FALSE,
    aesthetics = list(width = .5)) +
  theme_bw() + 
  scale_x_discrete(labels = function(x) parse(text=paste0("mu[", str_remove(x, "mu"), "]"))) +
  ylab("Variable\nimportance") +
  theme(text=element_text(size = 12, family="Times New Roman"),
        axis.title.x = element_text(margin = margin(t = 10)))
  
sens_rf_8_outlier
[1] 0.75
auc_rf_8_outlier
[1] 0.8313454
sens_rf_9_outlier
[1] 0.7954545
auc_rf_9_outlier
[1] 0.8796778
sens_rf_10_outlier
[1] 0.8292683
auc_rf_10_outlier
[1] 0.8204869
sens_rf_all_outlier
[1] 0.8429752
auc_rf_all_outlier
[1] 0.863586
confusionMatrix(data = pred, reference = totest$outlier, mode = "sens_spec")
Confusion Matrix and Statistics

          Reference
Prediction  yes   no
       yes  169  811
       no    38 1782
                                          
               Accuracy : 0.6968          
                 95% CI : (0.6794, 0.7138)
    No Information Rate : 0.9261          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.1853          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.81643         
            Specificity : 0.68723         
         Pos Pred Value : 0.17245         
         Neg Pred Value : 0.97912         
             Prevalence : 0.07393         
         Detection Rate : 0.06036         
   Detection Prevalence : 0.35000         
      Balanced Accuracy : 0.75183         
                                          
       'Positive' Class : yes             
                                          

Ahora para los cuartiles

totrain <- data_quartiles_normalized %>% 
  filter(n!=10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4")))

totest <- data_quartiles_normalized %>% 
  filter(n==10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4")))
  
set.seed(123)
trainIndex <- createDataPartition(totrain$quartile, p = .8, 
                                  list = FALSE, 
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

set.seed(123)
rf_quartiles <- train(
  quartile ~., data = dataTrain, method = "rf",
  tuneLength = 10,
  trControl = fitControl,
  metric = "ROC"
) 

pred <- predict(rf_quartiles, dataTest)
postResample(pred = pred, obs = dataTest$quartile)
 Accuracy     Kappa 
0.6839286 0.3257143 
# confusionMatrix(data = pred, reference = dataTest$quartile, mode = "sens_spec")
pred <- predict(rf_quartiles, totest)
postResample(pred = pred, obs = totest$quartile)
 Accuracy     Kappa 
0.6967857 0.3713439 
confusionMatrix(data = pred, reference = totest$quartile, mode = "sens_spec")
Confusion Matrix and Statistics

          Reference
Prediction fast slow
      fast  576  725
      slow  124 1375
                                          
               Accuracy : 0.6968          
                 95% CI : (0.6794, 0.7138)
    No Information Rate : 0.75            
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.3713          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.8229          
            Specificity : 0.6548          
         Pos Pred Value : 0.4427          
         Neg Pred Value : 0.9173          
             Prevalence : 0.2500          
         Detection Rate : 0.2057          
   Detection Prevalence : 0.4646          
      Balanced Accuracy : 0.7388          
                                          
       'Positive' Class : fast            
                                          
pred <- predict(rf_quartiles, totest, type= "prob")
AUC(pred$fast, ifelse(totest$quartile == "fast", 1, 0))
[1] 0.8010837
vip(rf_quartiles)

---
title: "Training"
output: html_notebook
---

```{r}
library(tidyverse)
library(caret)
```

# Data

```{r echo=FALSE}
# Jittered exec_time against m (coloured by quartile), one panel per n.
# Solid reference line: mean(exec_time); dashed line: median(exec_time).
times %>%
  ggplot(aes(x = as.numeric(m), y = exec_time, colour = quartile, group = m)) +
  geom_jitter(height = 0) +
  geom_hline(aes(yintercept = mean(exec_time))) +
  geom_hline(aes(yintercept = median(exec_time)), linetype = "dashed") +
  facet_wrap(~ n, scales = "free_x") +
  coord_flip() +
  labs(x = "", y = "") +
  theme_light()
```

```{r}
# Box plot of exec_time, one row per n (strip labels on the right).
# The y axis carries no information here, so its text, ticks and major grid
# lines are removed.
times %>%
  ggplot(aes(exec_time)) +
  geom_boxplot() +
  facet_wrap(n ~ ., scales = "free_x", nrow = 3, strip.position = "right") +
  scale_x_continuous(n.breaks = 10) +
  labs(x = "", y = "") +
  theme_bw() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank()
  )
```

The data are divided by quartiles in order to determine which profiles of rankings are slow and which are fast. Taking the number of alternatives into account, the profiles are divided into four equal parts: one quarter is taken as fast and the remaining three quarters are considered slow.

```{r echo=FALSE}
# Share of each quartile for every (n, m) combination, one panel per n, with
# the raw counts printed inside the bars and guide lines at 25/50/75 %.
#
# `times` already has a column called `n`, so count() cannot use its default
# output name. Naming the count column `nn` explicitly keeps the plot working
# without relying on (and being warned about) the automatic renaming.
times %>%
  count(n, m, quartile, name = "nn") %>%
  ggplot(aes(m, nn, fill = quartile)) +
  # Alternate the transparency of even/odd m so neighbouring bars are easier
  # to tell apart.
  geom_col(
    aes(alpha = as.numeric(as.character(m)) %% 2 == 0),
    position = "fill"
  ) +
  geom_text(
    aes(label = nn),
    position = position_fill(vjust = 0.5),
    angle = 90
  ) +
  facet_grid(. ~ n) +
  geom_hline(yintercept = c(0.25, 0.5, 0.75)) +
  # Manual scale instead of scale_alpha_discrete(), which warns that alpha is
  # not advised for discrete variables.
  scale_alpha_manual(values = c(`FALSE` = 0.6, `TRUE` = 1)) +
  theme_bw() +
  theme(legend.position = "none")
```
```{r}
# Resampling setup for the regression problem (kept for reference):
# fitControl <- trainControl(
#   method = "repeatedcv",
#   number = 5,
#   repeats = 2)

# Classification setup: 5-fold cross-validation repeated twice. Class
# probabilities are requested because prSummary needs them.
fitControl <- trainControl(
  method = "repeatedcv", number = 5, repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary
)
```

# Predicting execution time for profiles of a fixed size

```{r}
# Attach the timing information in `times` to the raw features.
# id is converted through character first so that a factor id yields its
# labels rather than its internal level codes.
# The join keys are spelled out (they are the ones dplyr reported when it
# guessed them) so the join cannot silently change if either table gains a
# column with a shared name.
data_quartiles <- left_join(
  predict_times %>%
    mutate(id = as.double(as.character(id))),
  times,
  by = c("n", "m", "id")
)
data_quartiles
```

```{r echo=FALSE}
# Attach the timing information in `times` to the normalised features.
# id is converted through character first so that a factor id yields its
# labels rather than its internal level codes.
# Explicit join keys: without `by`, dplyr joins on every shared column name
# and only reports its guess as a message.
# NOTE(review): keys assumed to be n, m, id, as for the other joins on
# data_normalized in this notebook — confirm.
data_quartiles_normalized <- left_join(
  data_normalized %>%
    mutate(id = as.double(as.character(id))),
  times,
  by = c("n", "m", "id")
)
data_quartiles_normalized
```

```{r}
# Resampling control for the quartile classifiers trained below:
# 5-fold cross-validation repeated twice, scored with prSummary.
fitControl <- trainControl(
  method = "repeatedcv", number = 5, repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary
)
```

Training 25%-75%

```{r}
# Gives very poor results because the classes are imbalanced
# # For n = 10
# totrain <- data_quartiles %>% 
#   filter(n==10) %>% 
#   mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2","q3","q4"))) %>%
#   select(starts_with("mu"), quartile) 
# set.seed(123)
# trainIndex <- createDataPartition(totrain$quartile, p = .8, 
#                                   list = FALSE, 
#                                   times = 1)
# dataTrain <- totrain[ trainIndex,]
# dataTest  <- totrain[-trainIndex,]
# set.seed(123)
# mclas_rf_10 <- train(
#   quartile ~., data = dataTrain, 
#   method = "rf",
#   tuneLength = 3,
#   trControl = fitControl,
#   metric = "AUC"
# )
# mclas_rf_10
```

```{r}
library(ROSE)
```

Para n = 10 con los datos normalizados

```{r}
# n = 10, normalised features: q1 is relabelled "fast", q2-q4 "slow".
totrain <- data_quartiles_normalized %>%
  filter(n == 10) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  ) %>%
  select(starts_with("mu"), quartile)

# Balance the two classes with ROSE, then move "slow" to the last factor level.
# NOTE(review): ROSE runs before the train/test split, so dataTest below is
# drawn from the ROSE output rather than from the original rows — confirm this
# is the intended evaluation set.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
# print(table(rose_train$quartile))

# 80/20 split of the balanced data.
set.seed(123)
trainIndex <- createDataPartition(
  rose_train$quartile,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- rose_train[trainIndex, ]
dataTest <- rose_train[-trainIndex, ]

# Random forest, model selection by the AUC reported by fitControl's summary.
set.seed(123)
rf_10_rose_norm <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 3,
  metric = "AUC"
)

vip_rf_10_rose_norm <- vip(rf_10_rose_norm)
confusionMatrix(rf_10_rose_norm)
```


Y en test:

```{r}
# Evaluate rf_10_rose_norm on the held-out rows: accuracy/kappa first, then
# the confusion matrix with precision/recall statistics.
pred <- predict(rf_10_rose_norm, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
confusionMatrix(
  data = pred,
  reference = dataTest$quartile,
  mode = "prec_recall"
)
```

Y sin normalizar:

```{r}
# Same n = 10 experiment on the non-normalised features. Class imbalance is
# handled by caret's up-sampling inside each resample (sampling = "up").
# NOTE(review): despite its name, rf_10_rose is not trained on ROSE output in
# this chunk; the name is kept because later chunks reuse it.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  classProbs = TRUE,
  summaryFunction = prSummary,
  sampling = "up")

# id is converted through character first (as in the first data_quartiles
# join) so that a factor id yields its labels, not its internal level codes.
# The join keys are explicit instead of being guessed from shared names.
data_quartiles <- left_join(
  predict_times %>%
    mutate(id = as.double(as.character(id))),
  times,
  by = c("n", "m", "id"))

totrain <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))) %>%
  select(starts_with("mu"), quartile)

# Seed the split and the training, like every other model in this notebook,
# so the partition and the resampling are reproducible.
set.seed(123)
trainIndex <- createDataPartition(totrain$quartile, p = .8,
                                  list = FALSE,
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

set.seed(123)
rf_10_rose <- train(
  quartile ~ ., data = dataTrain, method = "rf",
  tuneLength = 3,
  trControl = fitControl,
  metric = "AUC"
)
rf_10_rose
```




```{r}
# n = 10, non-normalised features: split first, then apply ROSE to the
# training part only, so dataTest keeps the original rows.

# id is converted through character first (as in the first data_quartiles
# join) so that a factor id yields its labels, not its internal level codes.
# The join keys are explicit instead of being guessed from shared names.
data_quartiles <- left_join(
  predict_times %>%
    mutate(id = as.double(as.character(id))),
  times,
  by = c("n", "m", "id"))

totrain <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))) %>%
  select(starts_with("mu"), quartile)

# Seed the partition as well; previously only ROSE and train() were seeded,
# so dataTrain/dataTest changed from run to run.
set.seed(123)
trainIndex <- createDataPartition(totrain$quartile, p = .8,
                                  list = FALSE,
                                  times = 1)

dataTrain <- totrain[ trainIndex,]
dataTest  <- totrain[-trainIndex,]

# Balance the training rows with ROSE and move "slow" to the last level.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = dataTrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))


set.seed(123)
rf_10_rose <- train(
  quartile ~ ., data = rose_train, method = "rf",
  tuneLength = 3,
  trControl = fitControl,
  metric = "AUC"
)

vip_rf_10_rose <- vip(rf_10_rose)
confusionMatrix(rf_10_rose)
```

```{r}
# Evaluate rf_10_rose on the held-out rows: accuracy/kappa first, then the
# confusion matrix with precision/recall statistics.
pred <- predict(rf_10_rose, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
confusionMatrix(
  data = pred,
  reference = dataTest$quartile,
  mode = "prec_recall"
)
```






```{r}
# n = 10, non-normalised features: ROSE on the full n = 10 data, then split.
# NOTE(review): because ROSE runs before the split, dataTest is drawn from the
# ROSE output rather than from the original rows — confirm this is intended
# (the previous chunk splits first and balances only the training part).

# id is converted through character first (as in the first data_quartiles
# join) so that a factor id yields its labels, not its internal level codes.
# The join keys are explicit instead of being guessed from shared names.
data_quartiles <- left_join(
  predict_times %>%
    mutate(id = as.double(as.character(id))),
  times,
  by = c("n", "m", "id"))

totrain <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))) %>%
  select(starts_with("mu"), quartile)

set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
# print(table(rose_train$quartile))

set.seed(123)
trainIndex <- createDataPartition(rose_train$quartile, p = .8,
                                  list = FALSE,
                                  times = 1)
dataTrain <- rose_train[ trainIndex,]
dataTest  <- rose_train[-trainIndex,]

set.seed(123)
rf_10_rose <- train(
  quartile ~ ., data = dataTrain, method = "rf",
  tuneLength = 3,
  trControl = fitControl,
  metric = "AUC"
)

vip_rf_10_rose <- vip(rf_10_rose)
confusionMatrix(rf_10_rose)
```

```{r}
# Neural network (caret method "nnet") on the current dataTrain, with the
# predictors centred and scaled inside train().
# NOTE(review): the object is called rpart_10_rose_norm although the method
# is "nnet" — confirm which model was intended.
set.seed(123)
rpart_10_rose_norm <- train(
  quartile ~ .,
  data = dataTrain,
  method = "nnet",
  preProcess = c("center", "scale"),
  trControl = fitControl,
  tuneLength = 3,
  metric = "AUC"
)
```




Comparación de las variables más importantes

```{r}
# Combine six variable-importance plots into one figure: the *_norm plots in
# the first group, the remaining ones in the second group.
# NOTE(review): the vip_mreg_* objects are not created in the visible part of
# this notebook, and combining plots with `+` / `|` presumably relies on the
# patchwork package being attached — confirm both before running.
(vip_mreg_rf_8_norm + vip_mreg_rf_9_norm + vip_mreg_rf_10_norm) |
(vip_mreg_rf_8 + vip_mreg_rf_9 + vip_mreg_rf_10)
```










```{r}
# Train on every n except 10, keep n == 10 aside as an external test set.
#
# The filter on n has to run BEFORE select(): select() keeps only the mu*
# columns and quartile, so a later filter(n != 10) no longer sees the data
# column `n` and either fails or falls back on whatever `n` happens to exist
# in the calling environment, leaving the rows unfiltered.
totrain <- data_quartiles %>%
  filter(n != 10) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))) %>%
  select(starts_with("mu"), quartile)
totest <- data_quartiles %>%
  filter(n == 10) %>%
  mutate(quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))) %>%
  select(starts_with("mu"), quartile)

# Balance the training rows with ROSE and move "slow" to the last level.
set.seed(123)
rose_train <- ROSE(quartile ~ ., data = totrain)$data %>%
  mutate(quartile = fct_relevel(quartile, "slow", after = Inf))
print(table(rose_train$quartile))

# 80/20 split of the balanced data.
set.seed(123)
trainIndex <- createDataPartition(rose_train$quartile, p = .8,
                                  list = FALSE,
                                  times = 1)
dataTrain <- rose_train[ trainIndex,]
dataTest  <- rose_train[-trainIndex,]
```


# Searching for the outliers

```{r}
# Attach the outlier label to the normalised features.
# id is converted through character first so that a factor id yields its
# labels rather than its internal level codes.
# The join keys are spelled out (they are the ones dplyr reported when it
# guessed them) so the join cannot silently change if either table gains a
# column with a shared name.
data_outliers_normalized <- left_join(
  data_normalized %>%
    mutate(id = as.double(as.character(id))),
  outliers,
  by = c("n", "m", "id")
)

# Resampling control for the outlier classifiers: 10-fold cross-validation
# repeated 5 times, scored with twoClassSummary, with the resampled training
# data down-sampled (sampling = "down").
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  sampling = "down")

data_outliers_normalized
```

```{r}
# Outlier classifier for n = 8: keep the mu* predictors and the outlier label.
totrain <- select(
  filter(data_outliers_normalized, n == 8),
  starts_with("mu"), outlier
)

# 80/20 split on the outlier label.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest, model selection by ROC.
set.seed(123)
rf_8_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

vip_rf_8_outlier <- vip(rf_8_outlier)

# Class predictions on the held-out rows: confusion matrix and sensitivity.
pred <- predict(rf_8_outlier, newdata = dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_8_outlier <- sensitivity(pred, dataTest$outlier)

# Class probabilities: AUC of P(yes) against a 0/1 encoding of the label.
pred <- predict(rf_8_outlier, newdata = dataTest, type = "prob")
auc_rf_8_outlier <- AUC(pred$yes, as.numeric(dataTest$outlier == "yes"))
```

```{r}
# Outlier classifier for n = 9: keep the mu* predictors and the outlier label.
totrain <- select(
  filter(data_outliers_normalized, n == 9),
  starts_with("mu"), outlier
)

# 80/20 split on the outlier label.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest, model selection by ROC.
set.seed(123)
rf_9_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

vip_rf_9_outlier <- vip(rf_9_outlier)

# Class predictions on the held-out rows: confusion matrix and sensitivity.
pred <- predict(rf_9_outlier, newdata = dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_9_outlier <- sensitivity(pred, dataTest$outlier)

# Class probabilities: AUC of P(yes) against a 0/1 encoding of the label.
pred <- predict(rf_9_outlier, newdata = dataTest, type = "prob")
auc_rf_9_outlier <- AUC(pred$yes, as.numeric(dataTest$outlier == "yes"))
```

```{r}
# Outlier classifier for n = 10: keep the mu* predictors and the outlier label.
totrain <- select(
  filter(data_outliers_normalized, n == 10),
  starts_with("mu"), outlier
)

# 80/20 split on the outlier label.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest, model selection by ROC.
set.seed(123)
rf_10_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

vip_rf_10_outlier <- vip(rf_10_outlier)

# Class predictions on the held-out rows: confusion matrix and sensitivity.
pred <- predict(rf_10_outlier, newdata = dataTest)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_10_outlier <- sensitivity(pred, dataTest$outlier)

# Class probabilities: AUC of P(yes) against a 0/1 encoding of the label.
pred <- predict(rf_10_outlier, newdata = dataTest, type = "prob")
auc_rf_10_outlier <- AUC(pred$yes, as.numeric(dataTest$outlier == "yes"))
```

```{r}
# Show the variable-importance plots of the n = 8, 9 and 10 outlier models
# together.
# NOTE(review): adding plot objects with `+` presumably relies on the
# patchwork package being attached — confirm.
vip_rf_8_outlier + vip_rf_9_outlier + vip_rf_10_outlier
```


```{r}
# Outlier classifier on all n together: mu* predictors plus the outlier label.
totrain <- select(data_outliers_normalized, starts_with("mu"), outlier)

# 80/20 split on the outlier label.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest, model selection by ROC.
set.seed(123)
rf_all_outlier <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 8,
  metric = "ROC"
)

# Class predictions on the held-out rows: accuracy/kappa, confusion matrix
# and sensitivity.
pred <- predict(rf_all_outlier, newdata = dataTest)
postResample(pred = pred, obs = dataTest$outlier)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")
sens_rf_all_outlier <- sensitivity(pred, dataTest$outlier)

vip(rf_all_outlier)

# Class probabilities: AUC of P(yes) against a 0/1 encoding of the label.
pred <- predict(rf_all_outlier, newdata = dataTest, type = "prob")
auc_rf_all_outlier <- AUC(pred$yes, as.numeric(dataTest$outlier == "yes"))
```

```{r}
# Publication-style variable-importance plot for rf_all_outlier.
# Axis labels such as "mu3" are turned into the plotmath expression mu[3] so
# they render as a Greek mu with a subscript.
vip(
  rf_all_outlier,
  horizontal = FALSE,
  aesthetics = list(width = .5)
) +
  theme_bw() +
  scale_x_discrete(
    labels = function(x) {
      parse(text = paste0("mu[", str_remove(x, "mu"), "]"))
    }
  ) +
  labs(y = "Variable\nimportance") +
  theme(
    text = element_text(size = 12, family = "Times New Roman"),
    axis.title.x = element_text(margin = margin(t = 10))
  )
  
```


```{r}
# Print the test-set sensitivity and AUC stored by the outlier chunks above,
# first for the per-n models (n = 8, 9, 10) and then for the all-n model.
sens_rf_8_outlier
auc_rf_8_outlier
sens_rf_9_outlier
auc_rf_9_outlier
sens_rf_10_outlier
auc_rf_10_outlier
sens_rf_all_outlier
auc_rf_all_outlier
```


```{r}
# Alternative resampling setup (kept for reference):
# fitControl <- trainControl(
#   method = "repeatedcv",
#   number = 3,
#   repeats = 5,
#   classProbs = TRUE,
#   summaryFunction = twoClassSummary,
#   sampling = "down")

# Train on every n except 10; keep n == 10 aside as an external test set.
totrain <- select(
  filter(data_outliers_normalized, n != 10),
  starts_with("mu"), outlier
)
totest <- select(
  filter(data_outliers_normalized, n == 10),
  starts_with("mu"), outlier
)

# 80/20 split of the n != 10 rows.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$outlier,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest, model selection by ROC.
set.seed(123)
rf_all_outlier2 <- train(
  outlier ~ .,
  data = dataTrain,
  method = "rf", # 85 with rpart2
  trControl = fitControl,
  tuneLength = 8,
  metric = "ROC"
)

# Internal hold-out (n != 10).
pred <- predict(rf_all_outlier2, newdata = dataTest)
postResample(pred = pred, obs = dataTest$outlier)
confusionMatrix(data = pred, reference = dataTest$outlier, mode = "sens_spec")

# External test set (n == 10).
pred <- predict(rf_all_outlier2, newdata = totest)
postResample(pred = pred, obs = totest$outlier)
confusionMatrix(data = pred, reference = totest$outlier, mode = "sens_spec")

# AUC of P(yes) on the external test set.
pred <- predict(rf_all_outlier2, newdata = totest, type = "prob")
AUC(pred$yes, as.numeric(totest$outlier == "yes"))

vip(rf_all_outlier2)
```


# Ahora para los cuartiles


```{r}
# Quartile classifier (q1 = "fast", q2-q4 = "slow") on normalised features:
# train on every n except 10, keep n == 10 aside as an external test set.
totrain <- data_quartiles_normalized %>%
  filter(n != 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

totest <- data_quartiles_normalized %>%
  filter(n == 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

# 80/20 split of the n != 10 rows.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$quartile,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Random forest, model selection by ROC.
set.seed(123)
rf_quartiles <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rf",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

# Internal hold-out (n != 10).
pred <- predict(rf_quartiles, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
# confusionMatrix(data = pred, reference = dataTest$quartile, mode = "sens_spec")

# External test set (n == 10).
pred <- predict(rf_quartiles, newdata = totest)
postResample(pred = pred, obs = totest$quartile)
confusionMatrix(data = pred, reference = totest$quartile, mode = "sens_spec")

# AUC of P(fast) on the external test set.
pred <- predict(rf_quartiles, newdata = totest, type = "prob")
AUC(pred$fast, as.numeric(totest$quartile == "fast"))

vip(rf_quartiles)
```

```{r}
# Same quartile experiment as the random forest, but with a decision tree
# (caret method "rpart2"). The object keeps the rf_ prefix used by later chunks.
totrain <- data_quartiles_normalized %>%
  filter(n != 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

totest <- data_quartiles_normalized %>%
  filter(n == 10) %>%
  select(starts_with("mu"), quartile) %>%
  mutate(
    quartile = fct_collapse(quartile, fast = c("q1"), slow = c("q2", "q3", "q4"))
  )

# 80/20 split of the n != 10 rows.
set.seed(123)
trainIndex <- createDataPartition(
  totrain$quartile,
  p = 0.8, list = FALSE, times = 1
)
dataTrain <- totrain[trainIndex, ]
dataTest <- totrain[-trainIndex, ]

# Decision tree, model selection by ROC.
set.seed(123)
rf_quartiles2 <- train(
  quartile ~ .,
  data = dataTrain,
  method = "rpart2",
  trControl = fitControl,
  tuneLength = 10,
  metric = "ROC"
)

# Internal hold-out (n != 10).
pred <- predict(rf_quartiles2, newdata = dataTest)
postResample(pred = pred, obs = dataTest$quartile)
# confusionMatrix(data = pred, reference = dataTest$quartile, mode = "sens_spec")

# External test set (n == 10).
pred <- predict(rf_quartiles2, newdata = totest)
postResample(pred = pred, obs = totest$quartile)
confusionMatrix(data = pred, reference = totest$quartile, mode = "sens_spec")

# AUC of P(fast) on the external test set.
pred <- predict(rf_quartiles2, newdata = totest, type = "prob")
AUC(pred$fast, as.numeric(totest$quartile == "fast"))

vip(rf_quartiles2)
```

```{r}
# Publication-style plot of the five most important variables of
# rf_quartiles2. Axis labels such as "mu3" are turned into the plotmath
# expression mu[3] so they render as a Greek mu with a subscript.
vip(
  rf_quartiles2,
  num_features = 5,
  aesthetics = list(width = .5)
) +
  theme_bw() +
  scale_x_discrete(
    labels = function(x) {
      parse(text = paste0("mu[", str_remove(x, "mu"), "]"))
    }
  ) +
  labs(y = "Variable importance") +
  theme(
    text = element_text(size = 12, family = "Times New Roman"),
    axis.title.x = element_text(margin = margin(t = 10))
  )
```

